Load data, prepared fpkm values and ortholog annonation

datasets = as.data.frame(scan("Stanford_datasets.txt",list(setname="",seqBatch="",species="",tissue=""),sep="\t"))
rawCounts <- as.matrix(read.table('Stanford_datasets_rawCountsMat.txt',header=FALSE,sep='\t'))
geneDetails <- as.data.frame(scan("ortholog_GC_table.txt",skip=1,list(mouse_name="",mouse_GC = 0.0,human_name = "",human_GC=0.0)))

Set columns names

colnames(rawCounts) <- datasets$setname
rownames(rawCounts) <- geneDetails$human_name 
rownames(datasets) <- datasets$setname
rownames(geneDetails) <- geneDetails$human_name 

Filter out lower 30% and mitochondrial genes

rowSums = apply(rawCounts,1,function(x) sum(x))
quantile(rowSums,probs = 0.3) # result is 2947.9 
##    30% 
## 2947.9
filter <- apply(rawCounts,1,function(x) sum(x)>2947.9 )
mt <- grep("mt-",geneDetails$mouse_name)
filteredNames <- setdiff(rownames(rawCounts[filter,]),rownames(rawCounts[mt,])) 
filteredRawCounts <- rawCounts[filteredNames,]

Normalize data accounting for GC content

GCnormCounts <- filteredRawCounts
GCnormCounts[,1:13] <- withinLaneNormalization(filteredRawCounts[,1:13],geneDetails[filteredNames,"human_GC"],which="loess",round=TRUE)
GCnormCounts[,14:26] <- withinLaneNormalization(filteredRawCounts[,14:26],geneDetails[filteredNames,"mouse_GC"],which="loess",round=TRUE)

Depth normalize using TMM scaling factors

origColSums <- apply(rawCounts,2,function(x) sum(x))
normFactors <- calcNormFactors(GCnormCounts,method='TMM')
colSums = apply(GCnormCounts,2,function(x) sum(x))
normalizedColSums <- origColSums
i <- 1
while (i<length(colSums)){
  normalizedColSums[i] <- origColSums[i]* normFactors[i]
  i <- i+1
}
meanDepth <- mean(normalizedColSums)
filteredDepthNormCounts <- GCnormCounts
i <- 1
while (i<ncol(filteredDepthNormCounts)){
 filteredDepthNormCounts[,i] <- (GCnormCounts[,i]/normalizedColSums[i])*meanDepth
 i <- i+1
 }

Normalize using log transforming

logTransformedDepthNormCounts <- log2(filteredDepthNormCounts+1)

Apply Harmony method to correct batch effect

meta <- data.frame(seqBatch = datasets$seqBatch,tissue=datasets$tissue,species=datasets$species)
harmonized_pcs <- HarmonyMatrix(
  data_mat  = logTransformedDepthNormCounts,       
  meta_data = meta,
  vars_use  = "seqBatch",
)
## Harmony 1/10
## Harmony converged after 1 iterations
plotData_pca <- prcomp(harmonized_pcs[, -1])

Check PCA statistics

summary(plotData_pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5    PC6     PC7
## Standard deviation     0.3195 0.2753 0.2512 0.23087 0.22044 0.1873 0.16456
## Proportion of Variance 0.1794 0.1333 0.1109 0.09372 0.08544 0.0617 0.04761
## Cumulative Proportion  0.1794 0.3127 0.4236 0.51735 0.60279 0.6645 0.71209
##                            PC8     PC9    PC10    PC11    PC12    PC13   PC14
## Standard deviation     0.15842 0.14771 0.14139 0.13924 0.12729 0.12227 0.1171
## Proportion of Variance 0.04413 0.03836 0.03515 0.03409 0.02849 0.02628 0.0241
## Cumulative Proportion  0.75622 0.79458 0.82973 0.86381 0.89230 0.91858 0.9427
##                           PC15    PC16    PC17    PC18    PC19
## Standard deviation     0.11384 0.10432 0.06797 0.05103 0.03915
## Proportion of Variance 0.02279 0.01913 0.00812 0.00458 0.00270
## Cumulative Proportion  0.96547 0.98460 0.99273 0.99730 1.00000

Transfer PCA data to plots

plotData = datasets[,c("setname","species","tissue")]
plotData$PC1 <- harmonized_pcs[,1]
plotData$PC2 <- harmonized_pcs[,2]
plotData$PC3 <- harmonized_pcs[,3]

Plot the first and the second principal components

Plot the first and the second principal components with centroids

Plot the first, the second and the third principal components

Test for significance of correlations between the matched tissues PC values of human and mouse

cor.test(plotData$PC1[1:13],plotData$PC1[14:26],method="pearson") 
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC1[1:13] and plotData$PC1[14:26]
## t = 1.3797, df = 11, p-value = 0.1951
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2117066  0.7717468
## sample estimates:
##       cor 
## 0.3840807
cor.test(plotData$PC2[1:13],plotData$PC2[14:26],method="pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC2[1:13] and plotData$PC2[14:26]
## t = 2.1433, df = 11, p-value = 0.05529
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01174541  0.84195285
## sample estimates:
##       cor 
## 0.5427523
cor.test(plotData$PC3[1:13],plotData$PC3[14:26],method="pearson") 
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC3[1:13] and plotData$PC3[14:26]
## t = 3.957, df = 11, p-value = 0.002246
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3728563 0.9262503
## sample estimates:
##       cor 
## 0.7663947